# Calculate total generation by BA, region, interconnection
# Also broken out by dispatchable/non-dispatchable
library(pacman)
p_load(
  here, fst, data.table, readxl, janitor, lubridate,
  collapse, purrr, ggplot2, stringr
)
theme_set(theme_minimal())

# First for balancing authorities --------------------------------------------
ba_dt = 
  read_xlsx(
    here("Data/electricity-generation/eia-nges/EIA930_Reference_Tables.xlsx")
  ) |> 
  clean_names() |>
  data.table()

# The unit info dt
unit_info_dt = read.fst(
  here("Data/electricity-generation/unit-info-dt.fst"),
  as.data.table = TRUE
)

# Reading the region EIA data
ba_netgen_dt = 
  read.fst(
    here("Data/electricity-generation/eia-nges/eia-nges-ba-raw.fst"),
    as.data.table = TRUE
  )

# Creating a table with every hour for each BA code and gen type in it 
all_ba_hour_dt = 
  CJ(
    ba = unique(ba_netgen_dt$ba),
    gen_type = unique(ba_netgen_dt$gen_type),
    date = seq(
      min(ba_netgen_dt$date),
      max(ba_netgen_dt$date), 
      by = 'hour'
    )
  ) |>
  setkey(date, ba, gen_type)

# Making sure every unit has a row for every hour 
ba_netgen_dt = 
  merge(
    all_ba_hour_dt,
    ba_netgen_dt, 
    by = c("ba","gen_type","date"),
    all.x = TRUE
  )  |>
  merge(
    ba_dt[,.(ba = ba_code, region = region_country_code)], 
    by = "ba"
  )

# Accounting for anomalous values: see "IMPUTATION" here
# https://www.eia.gov/electricity/gridmonitor/about
# - If value is missing or more than 1.5 times maximum of previous then 
#   1. Fill in with value from previous hour
#   2. Fill in with value from same hour one day prior

# Adding 1hr and 1 day lags
ba_netgen_1h_dt = ba_netgen_dt[,.(
  ba, gen_type, date = date + hours(1), value_lag_1h = value
)]
ba_netgen_1d_dt = ba_netgen_dt[,.(
  ba, gen_type, date = date + days(1), value_lag_1d = value
)]
# Take 99.7th percentile for each ba/gen_type
ba_netgen_99p = ba_netgen_dt[,.(
  value_99p = quantile(value, 0.997, na.rm = TRUE)
), by = .(ba,gen_type)]

ba_netgen_dt = 
  merge(
    ba_netgen_dt[,-c("value_lag_1h","value_lag_1d", "value_99p")], 
    ba_netgen_1h_dt,
    by = c("ba","gen_type","date"),
    all.x = TRUE
  ) |>
  merge(
    ba_netgen_1d_dt,
    by = c("ba","gen_type","date"),
    all.x = TRUE
  ) |>
  merge(
    ba_netgen_99p,
    by = c("ba","gen_type"),
    all.x = TRUE
  ) %>% 
  .[,':='(
    value_clean = fcase(
      !is.na(value) & value > 0 & value < 1.5*value_99p, value,
      !is.na(value_lag_1h) & value_lag_1h > 0 & value_lag_1h < 1.5*value_99p, value_lag_1h,
      !is.na(value_lag_1d) & value_lag_1d > 0 & value_lag_1d < 1.5*value_99p, value_lag_1d,
      default = 0
    ),
    value_imp_flag = fcase(
      !is.na(value) & value > 0 & value < 1.5*value_99p, 0,
      !is.na(value_lag_1h) & value_lag_1h > 0 & value_lag_1h < 1.5*value_99p, 1,
      !is.na(value_lag_1d) & value_lag_1d > 0 & value_lag_1d < 1.5*value_99p, 1,
      default = 0
    )
  )]

# Percent of records affected by data cleaning
ba_netgen_dt[,mean(value_imp_flag)]

plot_data_cleaning = function(ba_code){
   raw_plot = 
      ba_netgen_dt[ba == ba_code] |>
      ggplot(aes(x = date, y = value)) + geom_line() + 
      geom_line(aes(y = 1.5*value_99p), color = 'red', linetype = 'dashed')+ 
      facet_wrap(~gen_type, scales = 'free') + 
      labs(
        title = paste0("Raw values for ",ba_code),
        x = "Date", y = "Raw Value"
      )
   
   upd_plot = 
      ba_netgen_dt[ba == ba_code] |>
      ggplot(aes(x = date, y = value_clean)) + geom_line() + 
      geom_line(aes(y = 1.5*value_99p), color = 'red', linetype = 'dashed')+ 
      facet_wrap(~gen_type, scales = 'free')+ 
     labs(
       title = paste0("Cleaned values for ",ba_code),
       x = "Date", y = "Cleaned Value"
     )
    
  print(raw_plot)
  print(upd_plot)
   
}

#plot_data_cleaning(ba_code = "ERCO")
#plot_data_cleaning(ba_code = "BANC")

# Calculating load by balancing authority
ba_load_dt = 
  ba_netgen_dt[,.(
    load = sum(value_clean, na.rm = TRUE),
    load_nd = sum(ifelse(gen_type %in% c("wind","solar"), value_clean, 0), na.rm = TRUE),
    load_d = sum(ifelse(!(gen_type %in% c("wind","solar")), value_clean, 0), na.rm = TRUE),
    load_ff = sum(ifelse(gen_type %in% c("coal","gas","oil"), value_clean, 0), na.rm = TRUE),
    load_solar = sum(ifelse(gen_type %in% c("solar"), value_clean, 0), na.rm = TRUE),
    load_hydro = sum(ifelse(gen_type %in% c("hydro"), value_clean, 0), na.rm = TRUE),
    load_coal = sum(ifelse(gen_type %in% c("coal"), value_clean, 0), na.rm = TRUE),
    load_gas = sum(ifelse(gen_type %in% c("gas"), value_clean, 0), na.rm = TRUE)
  ),
  keyby = .(ba_code = ba, eia_region = region, utc_time = date)
  ][,':='(
    # EIA report data as hour ending, EPA as hour starting so changing hours here
    utc_time = utc_time - hours(1), 
    excess_load = load - load_nd
  )]

# Saving balancing authority load data
write.fst(
  ba_load_dt, 
  here("Data/electricity-generation/clean-load/ba-load-dt.fst")
)


# Aggregating BA to region ---------------------------------------------------
ba_nerc_xwalk = fread(here('Data/electricity-generation/ba-nerc-xwalk.csv'))

# Aggregating to nerc region
nerc_load_dt =
  merge(
    ba_load_dt,
    ba_nerc_xwalk,
    by = c("ba_code"),
    all.x = TRUE
  )[,
    lapply(.SD, sum, na.rm = TRUE),
    keyby = .(nerc_adj, utc_time),
    .SDcols = str_subset(colnames(ba_load_dt), "load")
  ]

write.fst(
  x = nerc_load_dt,
  path = here("Data/electricity-generation/clean-load/nerc-load-dt.fst")
)


# For the EIA regions, which we won't really use 
region_load_dt = 
  ba_load_dt[,.(
    load = sum(load, na.rm = TRUE), 
    load_nd = sum(load_nd, na.rm = TRUE), 
    load_d = sum(load_d, na.rm = TRUE),
    load_ff = sum(load_ff, na.rm = TRUE),
    excess_load = sum(excess_load, na.rm = TRUE)),
    by = .(eia_region, utc_time)
  ]

# Saving region by load data
write.fst(
  region_load_dt, 
  here("Data/electricity-generation/clean-load/region-load-dt.fst")
)

# Aggregating BA to interconnection -------------------------------------------
ba_load_dt[,ic := fcase(
  eia_region %in% c("CAL", "NW", "SW") 
    | ba_code %in% c("AESO", "BCHA","CEN","CFE"), 
  "west",
  eia_region %in% c("CAR","CENT","FLA","MIDA","MIDW","NE","NY", "SE", "TEN")
    | ba_code %in% c("SPC","IESO", "MHEB","NBSO"),
  "east",
  eia_region == "TEX", "texas"
)]

ic_load_dt = 
  ba_load_dt[,.(
    load = sum(load), 
    load_nd = sum(load_nd), 
    load_d = sum(load_d),
    load_ff = sum(load_ff),
    excess_load = sum(excess_load)),
    by = .(ic, utc_time)
  ]

# Saving interconnection load data
write.fst(
  ic_load_dt, 
  here("Data/electricity-generation/clean-load/ic-load-dt.fst")
)
 
